03. Training and Test¶
Train/validate and test different models for each resampling strategy.
01. Imports and Settings¶
In [1]:
# Imports
from catboost import CatBoostClassifier
from IPython.display import display
from libs.hyper_optimization import (
HyperParamCatBoostClassifier,
HyperParamLightGBMClassifier,
HyperParamXGBoostClassifier,
HyperParamRandomForestClassifier
) # hyper_optimization.py
from libs.model_evaluation import (
plot_roc_curve,
plot_pr_curve,
train_validate_model,
test_model,
plot_feature_importances,
plot_weights
) # model_evaluation.py
from libs.utils import split_X_y, save_object # utils.py
from lightgbm import LGBMClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
import pandas as pd
import warnings
# Silence all Python warnings for the rest of the notebook session.
warnings.filterwarnings(action="ignore")

# Pandas display settings, applied in one pass.
display_options = {
    "display.float_format": "{:.2f}".format,  # render floats with two decimals
    "display.max_rows": 500,
    "display.max_columns": 500,
    "display.max_colwidth": 150,  # wider cells so long values are less truncated
}
for option_name, option_value in display_options.items():
    pd.set_option(option_name, option_value)

# Default locations (relative paths, trailing slash included) for data, saved models and reports.
DATA_PATH, BIN_PATH, REPORTS_PATH = "data/", "bin/", "reports/"
02. Load Data¶
In [2]:
# Registry keyed by resampling strategy. Each entry holds the training data
# under "data" plus one slot per classifier with its fitted object and metrics.
resampled = {}
for strategy in ("ONLY_RUS", "ROS", "SMOTENC", "CTGAN"):
    # Empty placeholder frame until the real training data is assigned.
    strategy_entry = {"data": pd.DataFrame()}
    for clf in ("rf", "xgb", "lgb", "catboost", "xgb_boruta"):
        # Fresh placeholder object and fresh metric dicts for every slot (nothing is shared).
        strategy_entry[clf] = {"obj": object(), "train_metrics": {}, "test_metrics": {}}
    resampled[strategy] = strategy_entry
In [3]:
# Training sets: one CSV per resampling technique.
train_csv_by_strategy = {
    "ONLY_RUS": f"{DATA_PATH}resampling/train_data_ONLY_RUS.csv",
    "ROS": f"{DATA_PATH}resampling/train_data_ROS.csv",
    "SMOTENC": f"{DATA_PATH}resampling/train_data_SMOTENC.csv",
    "CTGAN": f"{DATA_PATH}resampling/train_data_CTGAN.csv",
}
for strategy_name, csv_path in train_csv_by_strategy.items():
    resampled[strategy_name]["data"] = pd.read_csv(csv_path)

# Test set: a single file, evaluated against every strategy's models.
test_df = pd.read_csv(f"{DATA_PATH}test_data.csv")

# Separate predictors from the "is_target" label.
# NOTE(review): the empty third argument is presumably a list of extra columns
# to drop; confirm against libs/utils.py.
X_test, y_test = split_X_y(test_df, "is_target", [])
In [4]:
# Categorical columns
categorical_cols = ["join_s", "sch_s", "sch_r"]

# Numerical columns
numerical_cols = [
    "adv_r", "data_s", "dist_ch_to_bs", "dist_to_ch",
    "expaned_energy", "rank", "send_code", "who_ch",
]

# Full feature list: categorical columns first, then the numerical ones.
features = [*categorical_cols, *numerical_cols]

# Features kept by Boruta (the selection itself is not performed in this notebook).
boruta_features = [
    "join_s", "who_ch", "data_s", "expaned_energy",
    "dist_ch_to_bs", "send_code", "rank", "adv_r",
]
03. Train Models¶
03.1. CatBoost¶
In [5]:
# CatBoost: tune, cross-validate, refit, test, plot and persist one model per resampling strategy.
for strategy in ["ONLY_RUS", "ROS", "SMOTENC", "CTGAN"]:
    print(f"\n>> {strategy}\n")

    # Slot in the registry that receives this strategy's CatBoost artefacts.
    slot = resampled[strategy]["catboost"]

    # Predictors / label for this strategy's training set.
    X_train, y_train = split_X_y(resampled[strategy]["data"], "is_target", [])

    # Hyperparameter search (10 trials); the third return value is not used.
    search = HyperParamCatBoostClassifier(X_train, y_train, n_trials=10)
    model, best_hyperparams, _ = search.run()

    # Cross-validation on the training data with the model returned by the search
    # (k = 5 per the original note; the fold count is not set in this cell).
    slot["train_metrics"] = train_validate_model(model, X_train, y_train)

    # Rebuild a model from the best hyperparameters and fit it on the training set.
    model = CatBoostClassifier(**best_hyperparams, cat_features=categorical_cols, logging_level="Silent")
    model.fit(X_train, y_train)

    # Evaluation on the test set.
    slot["test_metrics"] = test_model(model, X_test, y_test)
    print("\n")

    # ROC and precision-recall curves on the test set.
    plot_roc_curve(model, X_test, y_test)
    print("\n")
    plot_pr_curve(model, X_test, y_test)

    # Keep the fitted model in the registry and persist it under BIN_PATH.
    slot["obj"] = model
    save_object(model, f"{BIN_PATH}catboost_{strategy}_obj")
    print("\n")

    # Feature importances and weights, showing every feature.
    # NOTE(review): assumes `features` is in the same order as X_train's columns - confirm.
    n_features = len(features)
    plot_feature_importances(model, features, top_n=n_features)
    plot_weights(model, features, top_n=n_features)
    print("\n\n\n")
>> ONLY_RUS
[CatBoostClassifier] Optimizing: 10/10...
Training and Validation: {'Precision': '97.1 +- 0.15', 'Recall': '97.15 +- 0.23', 'F1': '97.12 +- 0.18', 'ROC_AUC': '99.9 +- 0.01', 'Log_Loss': '0.0584 +- 0.0031'}
Model Test: {'Precision': "81.04, {'0': 100.0, '1': 67.88, '2': 92.49, '3': 63.79}", 'Recall': "98.23, {'0': 97.25, '1': 99.76, '2': 99.85, '3': 96.07}", 'F1': "88.02, {'0': 98.61, '1': 80.79, '2': 96.03, '3': 76.67}", 'ROC_AUC': '99.91', 'Log_Loss': '0.116'}
| Weight | Feature |
|---|---|
| 0.3014 | expaned_energy |
| 0.1890 | dist_ch_to_bs |
| 0.1755 | adv_r |
| 0.1410 | who_ch |
| 0.0798 | sch_s |
| 0.0459 | rank |
| 0.0284 | data_s |
| 0.0122 | sch_r |
| 0.0102 | join_s |
| 0.0100 | dist_to_ch |
| 0.0066 | send_code |
>> ROS
[CatBoostClassifier] Optimizing: 10/10...
Training and Validation: {'Precision': '97.96 +- 0.1', 'Recall': '97.96 +- 0.17', 'F1': '97.95 +- 0.11', 'ROC_AUC': '99.93 +- 0.01', 'Log_Loss': '0.054 +- 0.0031'}
Model Test: {'Precision': "80.74, {'0': 100.0, '1': 66.41, '2': 92.1, '3': 64.46}", 'Recall': "98.46, {'0': 97.1, '1': 99.76, '2': 99.7, '3': 97.28}", 'F1': "87.89, {'0': 98.53, '1': 79.74, '2': 95.75, '3': 77.54}", 'ROC_AUC': '99.9', 'Log_Loss': '0.1295'}
| Weight | Feature |
|---|---|
| 0.3043 | expaned_energy |
| 0.2053 | dist_ch_to_bs |
| 0.1676 | adv_r |
| 0.1395 | who_ch |
| 0.0811 | sch_s |
| 0.0370 | rank |
| 0.0247 | data_s |
| 0.0137 | dist_to_ch |
| 0.0107 | sch_r |
| 0.0083 | join_s |
| 0.0078 | send_code |
>> SMOTENC
[CatBoostClassifier] Optimizing: 10/10...
Training and Validation: {'Precision': '97.6 +- 0.14', 'Recall': '97.56 +- 0.28', 'F1': '97.57 +- 0.17', 'ROC_AUC': '99.91 +- 0.01', 'Log_Loss': '0.0598 +- 0.0024'}
Model Test: {'Precision': "77.76, {'0': 100.0, '1': 64.76, '2': 91.69, '3': 54.59}", 'Recall': "98.18, {'0': 96.68, '1': 99.73, '2': 99.35, '3': 96.98}", 'F1': "85.52, {'0': 98.31, '1': 78.53, '2': 95.37, '3': 69.86}", 'ROC_AUC': '99.88', 'Log_Loss': '0.1359'}
| Weight | Feature |
|---|---|
| 0.3071 | expaned_energy |
| 0.2100 | dist_ch_to_bs |
| 0.1622 | adv_r |
| 0.1402 | who_ch |
| 0.0756 | sch_s |
| 0.0385 | rank |
| 0.0245 | data_s |
| 0.0158 | send_code |
| 0.0114 | dist_to_ch |
| 0.0079 | join_s |
| 0.0068 | sch_r |
>> CTGAN
[CatBoostClassifier] Optimizing: 10/10...
Training and Validation: {'Precision': '97.52 +- 0.1', 'Recall': '97.35 +- 0.26', 'F1': '97.43 +- 0.18', 'ROC_AUC': '99.91 +- 0.01', 'Log_Loss': '0.0573 +- 0.0026'}
Model Test: {'Precision': "79.57, {'0': 100.0, '1': 70.04, '2': 89.69, '3': 58.57}", 'Recall': "97.98, {'0': 97.24, '1': 99.73, '2': 99.95, '3': 95.02}", 'F1': "86.97, {'0': 98.6, '1': 82.29, '2': 94.54, '3': 72.47}", 'ROC_AUC': '99.9', 'Log_Loss': '0.1152'}
| Weight | Feature |
|---|---|
| 0.2818 | dist_ch_to_bs |
| 0.2470 | expaned_energy |
| 0.1367 | adv_r |
| 0.1075 | who_ch |
| 0.0862 | dist_to_ch |
| 0.0769 | sch_s |
| 0.0263 | rank |
| 0.0197 | data_s |
| 0.0072 | join_s |
| 0.0069 | sch_r |
| 0.0039 | send_code |
03.2. LightGBM¶
In [6]:
# LightGBM: tune, cross-validate, refit, test, plot and persist one model per resampling strategy.
for strategy in ["ONLY_RUS", "ROS", "SMOTENC", "CTGAN"]:
    print(f"\n>> {strategy}\n")

    # Slot in the registry that receives this strategy's LightGBM artefacts.
    slot = resampled[strategy]["lgb"]

    # Predictors / label for this strategy's training set.
    X_train, y_train = split_X_y(resampled[strategy]["data"], "is_target", [])

    # Hyperparameter search (10 trials); the third return value is not used.
    search = HyperParamLightGBMClassifier(X_train, y_train, n_trials=10)
    model, best_hyperparams, _ = search.run()

    # Cross-validation on the training data with the model returned by the search
    # (k = 5 per the original note; the fold count is not set in this cell).
    slot["train_metrics"] = train_validate_model(model, X_train, y_train)

    # Rebuild a model from the best hyperparameters (logging silenced) and fit it.
    model = LGBMClassifier(**best_hyperparams, verbosity=-1)
    model.fit(X_train, y_train)

    # Evaluation on the test set.
    slot["test_metrics"] = test_model(model, X_test, y_test)
    print("\n")

    # ROC and precision-recall curves on the test set.
    plot_roc_curve(model, X_test, y_test)
    print("\n")
    plot_pr_curve(model, X_test, y_test)

    # Keep the fitted model in the registry and persist it under BIN_PATH.
    slot["obj"] = model
    save_object(model, f"{BIN_PATH}lgb_{strategy}_obj")
    print("\n")

    # Feature importances and weights, showing every feature.
    # NOTE(review): assumes `features` is in the same order as X_train's columns - confirm.
    n_features = len(features)
    plot_feature_importances(model, features, top_n=n_features)
    plot_weights(model, features, top_n=n_features)
    print("\n\n\n")
>> ONLY_RUS
[LGBMClassifier] Optimizing: 10/10...
Training and Validation: {'Precision': '97.75 +- 0.15', 'Recall': '97.9 +- 0.26', 'F1': '97.82 +- 0.19', 'ROC_AUC': '99.94 +- 0.01', 'Log_Loss': '0.0446 +- 0.0037'}
Model Test: {'Precision': "87.38, {'0': 100.0, '1': 67.24, '2': 99.4, '3': 82.9}", 'Recall': "98.28, {'0': 97.78, '1': 99.69, '2': 98.96, '3': 96.68}", 'F1': "91.91, {'0': 98.87, '1': 80.31, '2': 99.18, '3': 89.26}", 'ROC_AUC': '99.94', 'Log_Loss': '0.1032'}
| Weight | Feature |
|---|---|
| 0.4064 | rank |
| 0.3376 | expaned_energy |
| 0.0815 | dist_ch_to_bs |
| 0.0732 | adv_r |
| 0.0299 | sch_s |
| 0.0295 | join_s |
| 0.0257 | who_ch |
| 0.0158 | data_s |
| 0.0002 | sch_r |
| 0.0001 | send_code |
| 0 | dist_to_ch |
>> ROS
[LGBMClassifier] Optimizing: 10/10...
Training and Validation: {'Precision': '98.57 +- 0.17', 'Recall': '98.6 +- 0.17', 'F1': '98.58 +- 0.15', 'ROC_AUC': '99.95 +- 0.01', 'Log_Loss': '0.0383 +- 0.0031'}
Model Test: {'Precision': "87.05, {'0': 100.0, '1': 66.71, '2': 99.5, '3': 81.99}", 'Recall': "98.34, {'0': 97.71, '1': 99.76, '2': 98.91, '3': 96.98}", 'F1': "91.71, {'0': 98.84, '1': 79.96, '2': 99.2, '3': 88.86}", 'ROC_AUC': '99.94', 'Log_Loss': '0.1098'}
| Weight | Feature |
|---|---|
| 0.3760 | rank |
| 0.3591 | expaned_energy |
| 0.0865 | dist_ch_to_bs |
| 0.0778 | adv_r |
| 0.0320 | sch_s |
| 0.0284 | who_ch |
| 0.0256 | join_s |
| 0.0144 | data_s |
| 0.0002 | sch_r |
| 0.0000 | send_code |
| 0.0000 | dist_to_ch |
>> SMOTENC
[LGBMClassifier] Optimizing: 10/10...
Training and Validation: {'Precision': '98.14 +- 0.11', 'Recall': '98.12 +- 0.2', 'F1': '98.13 +- 0.12', 'ROC_AUC': '99.93 +- 0.01', 'Log_Loss': '0.0468 +- 0.0037'}
Model Test: {'Precision': "84.82, {'0': 100.0, '1': 63.72, '2': 98.81, '3': 76.73}", 'Recall': "98.28, {'0': 97.33, '1': 99.28, '2': 99.4, '3': 97.13}", 'F1': "90.28, {'0': 98.64, '1': 77.62, '2': 99.11, '3': 85.73}", 'ROC_AUC': '99.89', 'Log_Loss': '0.1259'}
| Weight | Feature |
|---|---|
| 0.3771 | rank |
| 0.3544 | expaned_energy |
| 0.0880 | dist_ch_to_bs |
| 0.0780 | adv_r |
| 0.0304 | sch_s |
| 0.0296 | who_ch |
| 0.0277 | join_s |
| 0.0146 | data_s |
| 0.0001 | sch_r |
| 0.0001 | send_code |
| 0 | dist_to_ch |
>> CTGAN
[LGBMClassifier] Optimizing: 10/10...
Training and Validation: {'Precision': '98.0 +- 0.13', 'Recall': '98.01 +- 0.2', 'F1': '98.0 +- 0.13', 'ROC_AUC': '99.94 +- 0.01', 'Log_Loss': '0.043 +- 0.0027'}
Model Test: {'Precision': "86.24, {'0': 100.0, '1': 67.28, '2': 99.4, '3': 78.27}", 'Recall': "98.3, {'0': 97.71, '1': 99.69, '2': 98.96, '3': 96.83}", 'F1': "91.23, {'0': 98.84, '1': 80.34, '2': 99.18, '3': 86.56}", 'ROC_AUC': '99.94', 'Log_Loss': '0.1011'}
| Weight | Feature |
|---|---|
| 0.3466 | rank |
| 0.3019 | expaned_energy |
| 0.1515 | dist_ch_to_bs |
| 0.0702 | adv_r |
| 0.0367 | sch_s |
| 0.0347 | who_ch |
| 0.0279 | join_s |
| 0.0153 | data_s |
| 0.0147 | dist_to_ch |
| 0.0003 | sch_r |
| 0.0001 | send_code |
03.3. XGBoost¶
In [7]:
# XGBoost: tune, cross-validate, refit, test, plot and persist one model per resampling strategy.
for strategy in ["ONLY_RUS", "ROS", "SMOTENC", "CTGAN"]:  # For each resampling strategy...
    print(f"\n>> {strategy}\n")

    # Split into X and y
    X_train, y_train = split_X_y(resampled[strategy]["data"], "is_target", [])

    # Optimizing hyperparameters (10 trials); the third return value is not used.
    model, best_hyperparams, _ = HyperParamXGBoostClassifier(
        X_train, y_train, n_trials=10).run()

    # Cross-validation on the training data with the model returned by the search
    # (k = 5 per the original note; the fold count is not set in this cell).
    resampled[strategy]["xgb"]["train_metrics"] = train_validate_model(model, X_train, y_train)

    # Training model...
    # verbosity=0 silences XGBoost's own logging, matching the XGBoost-Boruta cell
    # (and the silent CatBoost / LightGBM refits) so all boosters are equally quiet.
    model = XGBClassifier(**best_hyperparams, verbosity=0)
    model.fit(X_train, y_train)

    # Model testing on the test set.
    resampled[strategy]["xgb"]["test_metrics"] = test_model(model, X_test, y_test)
    print("\n")

    # ROC and PR Curves
    plot_roc_curve(model, X_test, y_test)
    print("\n")
    plot_pr_curve(model, X_test, y_test)

    # Save trained model (in the registry and under BIN_PATH).
    resampled[strategy]["xgb"]["obj"] = model
    save_object(resampled[strategy]["xgb"]["obj"], f"{BIN_PATH}xgb_{strategy}_obj")
    print("\n")

    # Feature Importance
    # NOTE(review): assumes `features` is in the same order as X_train's columns - confirm.
    plot_feature_importances(resampled[strategy]["xgb"]["obj"], features, top_n=len(features))
    # Weights
    plot_weights(resampled[strategy]["xgb"]["obj"], features, top_n=len(features))
    print("\n\n\n")
>> ONLY_RUS
[XGBClassifier] Optimizing: 10/10...
Training and Validation: {'Precision': '97.65 +- 0.25', 'Recall': '97.73 +- 0.23', 'F1': '97.69 +- 0.23', 'ROC_AUC': '99.93 +- 0.01', 'Log_Loss': '0.0482 +- 0.0039'}
Model Test: {'Precision': "88.16, {'0': 99.99, '1': 67.03, '2': 99.5, '3': 86.12}", 'Recall': "98.22, {'0': 97.8, '1': 99.76, '2': 98.81, '3': 96.53}", 'F1': "92.31, {'0': 98.88, '1': 80.19, '2': 99.15, '3': 91.03}", 'ROC_AUC': '99.94', 'Log_Loss': '0.1068'}
| Weight | Feature |
|---|---|
| 0.9163 | rank |
| 0.0308 | join_s |
| 0.0254 | data_s |
| 0.0105 | sch_s |
| 0.0104 | expaned_energy |
| 0.0030 | dist_ch_to_bs |
| 0.0026 | adv_r |
| 0.0007 | who_ch |
| 0.0004 | sch_r |
| 0 | send_code |
| 0 | dist_to_ch |
>> ROS
[XGBClassifier] Optimizing: 10/10...
Training and Validation: {'Precision': '98.53 +- 0.15', 'Recall': '98.56 +- 0.22', 'F1': '98.54 +- 0.18', 'ROC_AUC': '99.95 +- 0.01', 'Log_Loss': '0.0391 +- 0.0044'}
Model Test: {'Precision': "84.96, {'0': 99.99, '1': 66.77, '2': 99.5, '3': 73.59}", 'Recall': "98.05, {'0': 97.6, '1': 99.76, '2': 98.46, '3': 96.37}", 'F1': "90.3, {'0': 98.78, '1': 80.0, '2': 98.97, '3': 83.45}", 'ROC_AUC': '99.94', 'Log_Loss': '0.1141'}
| Weight | Feature |
|---|---|
| 0.9255 | rank |
| 0.0252 | join_s |
| 0.0200 | data_s |
| 0.0114 | sch_s |
| 0.0106 | expaned_energy |
| 0.0032 | dist_ch_to_bs |
| 0.0029 | adv_r |
| 0.0007 | who_ch |
| 0.0005 | sch_r |
| 0 | send_code |
| 0 | dist_to_ch |
>> SMOTENC
[XGBClassifier] Optimizing: 10/10...
Training and Validation: {'Precision': '98.06 +- 0.19', 'Recall': '97.97 +- 0.18', 'F1': '98.01 +- 0.14', 'ROC_AUC': '99.93 +- 0.01', 'Log_Loss': '0.0494 +- 0.0031'}
Model Test: {'Precision': "83.75, {'0': 100.0, '1': 65.15, '2': 98.76, '3': 71.08}", 'Recall': "98.31, {'0': 97.38, '1': 99.25, '2': 99.35, '3': 97.28}", 'F1': "89.63, {'0': 98.67, '1': 78.66, '2': 99.06, '3': 82.14}", 'ROC_AUC': '99.87', 'Log_Loss': '0.1342'}
| Weight | Feature |
|---|---|
| 0.9219 | rank |
| 0.0304 | join_s |
| 0.0200 | data_s |
| 0.0107 | sch_s |
| 0.0097 | expaned_energy |
| 0.0033 | dist_ch_to_bs |
| 0.0029 | adv_r |
| 0.0007 | who_ch |
| 0.0003 | sch_r |
| 0.0001 | dist_to_ch |
| 0 | send_code |
>> CTGAN
[XGBClassifier] Optimizing: 10/10...
Training and Validation: {'Precision': '97.93 +- 0.14', 'Recall': '97.74 +- 0.21', 'F1': '97.83 +- 0.16', 'ROC_AUC': '99.93 +- 0.01', 'Log_Loss': '0.0471 +- 0.0036'}
Model Test: {'Precision': "87.15, {'0': 99.99, '1': 67.5, '2': 99.45, '3': 81.67}", 'Recall': "98.16, {'0': 97.79, '1': 99.76, '2': 98.86, '3': 96.22}", 'F1': "91.72, {'0': 98.88, '1': 80.52, '2': 99.15, '3': 88.35}", 'ROC_AUC': '99.94', 'Log_Loss': '0.103'}
| Weight | Feature |
|---|---|
| 0.9246 | rank |
| 0.0299 | join_s |
| 0.0189 | data_s |
| 0.0102 | sch_s |
| 0.0079 | expaned_energy |
| 0.0042 | dist_ch_to_bs |
| 0.0024 | adv_r |
| 0.0009 | dist_to_ch |
| 0.0007 | who_ch |
| 0.0003 | sch_r |
| 0.0000 | send_code |
03.4. Random Forest¶
In [8]:
# Random Forest: tune, cross-validate, refit, test, plot and persist one model per resampling strategy.
for strategy in ["ONLY_RUS", "ROS", "SMOTENC", "CTGAN"]:
    print(f"\n>> {strategy}\n")

    # Slot in the registry that receives this strategy's Random Forest artefacts.
    slot = resampled[strategy]["rf"]

    # Predictors / label for this strategy's training set.
    X_train, y_train = split_X_y(resampled[strategy]["data"], "is_target", [])

    # Hyperparameter search (10 trials); the third return value is not used.
    search = HyperParamRandomForestClassifier(X_train, y_train, n_trials=10)
    model, best_hyperparams, _ = search.run()

    # Cross-validation on the training data with the model returned by the search
    # (k = 5 per the original note; the fold count is not set in this cell).
    slot["train_metrics"] = train_validate_model(model, X_train, y_train)

    # Rebuild a model from the best hyperparameters and fit it on the training set.
    model = RandomForestClassifier(**best_hyperparams)
    model.fit(X_train, y_train)

    # Evaluation on the test set.
    slot["test_metrics"] = test_model(model, X_test, y_test)
    print("\n")

    # ROC and precision-recall curves on the test set.
    plot_roc_curve(model, X_test, y_test)
    print("\n")
    plot_pr_curve(model, X_test, y_test)

    # Keep the fitted model in the registry and persist it under BIN_PATH.
    slot["obj"] = model
    save_object(model, f"{BIN_PATH}rf_{strategy}_obj")
    print("\n")

    # Feature importances and weights, showing every feature.
    # NOTE(review): assumes `features` is in the same order as X_train's columns - confirm.
    n_features = len(features)
    plot_feature_importances(model, features, top_n=n_features)
    plot_weights(model, features, top_n=n_features)
    print("\n\n\n")
>> ONLY_RUS
[RandomForestClassifier] Optimizing: 10/10...
Training and Validation: {'Precision': '97.5 +- 0.2', 'Recall': '97.09 +- 0.18', 'F1': '97.29 +- 0.16', 'ROC_AUC': '99.87 +- 0.04', 'Log_Loss': '0.0706 +- 0.0115'}
Model Test: {'Precision': "84.22, {'0': 99.99, '1': 69.7, '2': 95.14, '3': 72.06}", 'Recall': "98.22, {'0': 97.69, '1': 99.69, '2': 99.25, '3': 96.22}", 'F1': "90.11, {'0': 98.83, '1': 82.04, '2': 97.15, '3': 82.41}", 'ROC_AUC': '99.6', 'Log_Loss': '0.4912'}
| Weight | Feature |
|---|---|
| 0.5057 ± 0.0039 | rank |
| 0.2932 ± 0.0102 | expaned_energy |
| 0.0753 ± 0.0104 | adv_r |
| 0.0604 ± 0.0034 | dist_ch_to_bs |
| 0.0301 ± 0.0032 | who_ch |
| 0.0186 ± 0.0079 | data_s |
| 0.0149 ± 0.0079 | sch_s |
| 0.0006 ± 0.0005 | sch_r |
| 0.0005 ± 0.0007 | send_code |
| 0.0005 ± 0.0007 | join_s |
| 0.0003 ± 0.0006 | dist_to_ch |
>> ROS
[RandomForestClassifier] Optimizing: 10/10...
Training and Validation: {'Precision': '98.29 +- 0.14', 'Recall': '98.09 +- 0.26', 'F1': '98.18 +- 0.18', 'ROC_AUC': '99.91 +- 0.02', 'Log_Loss': '0.0586 +- 0.0094'}
Model Test: {'Precision': "83.36, {'0': 100.0, '1': 67.76, '2': 94.95, '3': 70.75}", 'Recall': "98.31, {'0': 97.48, '1': 99.73, '2': 99.2, '3': 96.83}", 'F1': "89.55, {'0': 98.72, '1': 80.69, '2': 97.03, '3': 81.76}", 'ROC_AUC': '99.58', 'Log_Loss': '0.5211'}
| Weight | Feature |
|---|---|
| 0.4612 ± 0.0042 | rank |
| 0.3071 ± 0.0204 | expaned_energy |
| 0.0995 ± 0.0210 | adv_r |
| 0.0664 ± 0.0035 | dist_ch_to_bs |
| 0.0311 ± 0.0035 | who_ch |
| 0.0206 ± 0.0133 | sch_s |
| 0.0124 ± 0.0119 | data_s |
| 0.0007 ± 0.0004 | sch_r |
| 0.0004 ± 0.0007 | send_code |
| 0.0003 ± 0.0005 | dist_to_ch |
| 0.0003 ± 0.0005 | join_s |
>> SMOTENC
[RandomForestClassifier] Optimizing: 10/10...
Training and Validation: {'Precision': '97.93 +- 0.11', 'Recall': '97.64 +- 0.2', 'F1': '97.78 +- 0.08', 'ROC_AUC': '99.88 +- 0.02', 'Log_Loss': '0.0725 +- 0.0083'}
Model Test: {'Precision': "79.06, {'0': 100.0, '1': 64.8, '2': 93.47, '3': 57.98}", 'Recall': "98.18, {'0': 96.86, '1': 99.73, '2': 99.0, '3': 97.13}", 'F1': "86.43, {'0': 98.4, '1': 78.56, '2': 96.16, '3': 72.61}", 'ROC_AUC': '99.53', 'Log_Loss': '0.5274'}
| Weight | Feature |
|---|---|
| 0.4611 ± 0.0039 | rank |
| 0.3058 ± 0.0218 | expaned_energy |
| 0.0939 ± 0.0215 | adv_r |
| 0.0686 ± 0.0044 | dist_ch_to_bs |
| 0.0366 ± 0.0057 | who_ch |
| 0.0169 ± 0.0035 | data_s |
| 0.0157 ± 0.0052 | sch_s |
| 0.0004 ± 0.0007 | join_s |
| 0.0004 ± 0.0006 | dist_to_ch |
| 0.0003 ± 0.0003 | sch_r |
| 0.0003 ± 0.0005 | send_code |
>> CTGAN
[RandomForestClassifier] Optimizing: 10/10...
Training and Validation: {'Precision': '97.46 +- 0.15', 'Recall': '97.13 +- 0.3', 'F1': '97.29 +- 0.19', 'ROC_AUC': '99.87 +- 0.02', 'Log_Loss': '0.0697 +- 0.0071'}
Model Test: {'Precision': "82.7, {'0': 99.99, '1': 67.19, '2': 94.52, '3': 69.09}", 'Recall': "97.9, {'0': 97.39, '1': 99.76, '2': 99.6, '3': 94.86}", 'F1': "88.98, {'0': 98.68, '1': 80.3, '2': 97.0, '3': 79.95}", 'ROC_AUC': '99.7', 'Log_Loss': '0.487'}
| Weight | Feature |
|---|---|
| 0.4627 ± 0.0035 | rank |
| 0.2551 ± 0.0231 | expaned_energy |
| 0.1174 ± 0.0196 | dist_ch_to_bs |
| 0.0786 ± 0.0034 | adv_r |
| 0.0293 ± 0.0114 | sch_s |
| 0.0277 ± 0.0028 | who_ch |
| 0.0161 ± 0.0073 | data_s |
| 0.0119 ± 0.0039 | dist_to_ch |
| 0.0005 ± 0.0004 | sch_r |
| 0.0003 ± 0.0005 | send_code |
| 0.0003 ± 0.0006 | join_s |
03.5. XGBoost-Boruta¶
In [9]:
# XGBoost on the Boruta feature subset: tune, cross-validate, refit, test, plot and persist per strategy.
for strategy in ["ONLY_RUS", "ROS", "SMOTENC", "CTGAN"]:
    print(f"\n>> {strategy}\n")

    # Slot in the registry that receives this strategy's XGBoost-Boruta artefacts.
    slot = resampled[strategy]["xgb_boruta"]

    # Restrict the training data to the Boruta features (plus the label) before splitting.
    boruta_train_df = resampled[strategy]["data"][boruta_features + ["is_target"]]
    X_train, y_train = split_X_y(boruta_train_df, "is_target", [])

    # The test predictors get the same column subset for every evaluation below.
    X_test_boruta = X_test[boruta_features]

    # Hyperparameter search (10 trials); the third return value is not used.
    search = HyperParamXGBoostClassifier(X_train, y_train, n_trials=10)
    model, best_hyperparams, _ = search.run()

    # Cross-validation on the training data with the model returned by the search
    # (k = 5 per the original note; the fold count is not set in this cell).
    slot["train_metrics"] = train_validate_model(model, X_train, y_train)

    # Rebuild a model from the best hyperparameters (logging silenced) and fit it.
    model = XGBClassifier(**best_hyperparams, verbosity=0)
    model.fit(X_train, y_train)

    # Evaluation on the test set, Boruta features only.
    slot["test_metrics"] = test_model(model, X_test_boruta, y_test)
    print("\n")

    # ROC and precision-recall curves on the test set.
    plot_roc_curve(model, X_test_boruta, y_test)
    print("\n")
    plot_pr_curve(model, X_test_boruta, y_test)

    # Keep the fitted model in the registry and persist it under BIN_PATH.
    slot["obj"] = model
    save_object(model, f"{BIN_PATH}xgb_boruta_{strategy}_obj")
    print("\n")

    # Feature importances and weights, showing every Boruta feature.
    n_boruta = len(boruta_features)
    plot_feature_importances(model, boruta_features, top_n=n_boruta)
    plot_weights(model, boruta_features, top_n=n_boruta)
    print("\n\n\n")
>> ONLY_RUS
[XGBClassifier] Optimizing: 10/10...
Training and Validation: {'Precision': '97.34 +- 0.22', 'Recall': '97.5 +- 0.19', 'F1': '97.42 +- 0.18', 'ROC_AUC': '99.92 +- 0.01', 'Log_Loss': '0.0521 +- 0.0038'}
Model Test: {'Precision': "86.33, {'0': 100.0, '1': 65.23, '2': 99.55, '3': 80.56}", 'Recall': "97.98, {'0': 97.57, '1': 99.73, '2': 98.26, '3': 96.37}", 'F1': "91.07, {'0': 98.77, '1': 78.87, '2': 98.9, '3': 87.76}", 'ROC_AUC': '99.93', 'Log_Loss': '0.122'}
| Weight | Feature |
|---|---|
| 0.9250 | rank |
| 0.0309 | data_s |
| 0.0267 | join_s |
| 0.0101 | expaned_energy |
| 0.0033 | adv_r |
| 0.0032 | dist_ch_to_bs |
| 0.0008 | who_ch |
| 0 | send_code |
>> ROS
[XGBClassifier] Optimizing: 10/10...
Training and Validation: {'Precision': '98.38 +- 0.2', 'Recall': '98.49 +- 0.25', 'F1': '98.43 +- 0.21', 'ROC_AUC': '99.94 +- 0.01', 'Log_Loss': '0.0417 +- 0.0038'}
Model Test: {'Precision': "85.53, {'0': 100.0, '1': 65.58, '2': 99.45, '3': 77.09}", 'Recall': "98.04, {'0': 97.54, '1': 99.73, '2': 98.81, '3': 96.07}", 'F1': "90.64, {'0': 98.75, '1': 79.12, '2': 99.13, '3': 85.54}", 'ROC_AUC': '99.93', 'Log_Loss': '0.1302'}
| Weight | Feature |
|---|---|
| 0.9329 | rank |
| 0.0267 | data_s |
| 0.0219 | join_s |
| 0.0106 | expaned_energy |
| 0.0036 | adv_r |
| 0.0035 | dist_ch_to_bs |
| 0.0008 | who_ch |
| 0 | send_code |
>> SMOTENC
[XGBClassifier] Optimizing: 10/10...
Training and Validation: {'Precision': '98.03 +- 0.17', 'Recall': '97.92 +- 0.19', 'F1': '97.97 +- 0.15', 'ROC_AUC': '99.92 +- 0.01', 'Log_Loss': '0.0524 +- 0.0028'}
Model Test: {'Precision': "84.86, {'0': 100.0, '1': 64.62, '2': 98.8, '3': 76.01}", 'Recall': "97.94, {'0': 97.44, '1': 99.25, '2': 98.41, '3': 96.68}", 'F1': "90.17, {'0': 98.7, '1': 78.28, '2': 98.6, '3': 85.11}", 'ROC_AUC': '99.87', 'Log_Loss': '0.1385'}
| Weight | Feature |
|---|---|
| 0.9266 | rank |
| 0.0279 | join_s |
| 0.0271 | data_s |
| 0.0101 | expaned_energy |
| 0.0039 | adv_r |
| 0.0035 | dist_ch_to_bs |
| 0.0009 | who_ch |
| 0 | send_code |
>> CTGAN
[XGBClassifier] Optimizing: 10/10...
Training and Validation: {'Precision': '97.56 +- 0.1', 'Recall': '97.34 +- 0.15', 'F1': '97.45 +- 0.11', 'ROC_AUC': '99.91 +- 0.01', 'Log_Loss': '0.0553 +- 0.0031'}
Model Test: {'Precision': "83.86, {'0': 100.0, '1': 66.34, '2': 99.35, '3': 69.76}", 'Recall': "98.15, {'0': 97.48, '1': 99.59, '2': 99.0, '3': 96.53}", 'F1': "89.63, {'0': 98.72, '1': 79.63, '2': 99.18, '3': 80.99}", 'ROC_AUC': '99.92', 'Log_Loss': '0.1301'}
| Weight | Feature |
|---|---|
| 0.9269 | rank |
| 0.0290 | data_s |
| 0.0281 | join_s |
| 0.0076 | expaned_energy |
| 0.0045 | dist_ch_to_bs |
| 0.0030 | adv_r |
| 0.0009 | who_ch |
| 0 | send_code |
04. Synthesized Results¶
In [10]:
# Flatten the registry into one row per (strategy, model) pair.
results = []
for strategy, models in resampled.items():
    for model_name, model_metrics in models.items():
        # The "data" entry holds the training frame, not a model slot - skip it.
        if model_name in ["data"]:
            continue
        row = {
            "strategy": strategy,
            "model": model_name.upper(),
            # The metric dictionaries are stored as-is, one per cell.
            "train_metrics": model_metrics.get("train_metrics", {}),
            "test_metrics": model_metrics.get("test_metrics", {}),
        }
        results.append(row)

# Build the summary table from the collected rows and show it.
results_df = pd.DataFrame(results)
display(results_df)
| strategy | model | train_metrics | test_metrics | |
|---|---|---|---|---|
| 0 | ONLY_RUS | RF | {'Precision': '97.5 +- 0.2', 'Recall': '97.09 +- 0.18', 'F1': '97.29 +- 0.16', 'ROC_AUC': '99.87 +- 0.04', 'Log_Loss': '0.0706 +- 0.0115'} | {'Precision': '84.22, {'0': 99.99, '1': 69.7, '2': 95.14, '3': 72.06}', 'Recall': '98.22, {'0': 97.69, '1': 99.69, '2': 99.25, '3': 96.22}', 'F1':... |
| 1 | ONLY_RUS | XGB | {'Precision': '97.65 +- 0.25', 'Recall': '97.73 +- 0.23', 'F1': '97.69 +- 0.23', 'ROC_AUC': '99.93 +- 0.01', 'Log_Loss': '0.0482 +- 0.0039'} | {'Precision': '88.16, {'0': 99.99, '1': 67.03, '2': 99.5, '3': 86.12}', 'Recall': '98.22, {'0': 97.8, '1': 99.76, '2': 98.81, '3': 96.53}', 'F1': ... |
| 2 | ONLY_RUS | LGB | {'Precision': '97.75 +- 0.15', 'Recall': '97.9 +- 0.26', 'F1': '97.82 +- 0.19', 'ROC_AUC': '99.94 +- 0.01', 'Log_Loss': '0.0446 +- 0.0037'} | {'Precision': '87.38, {'0': 100.0, '1': 67.24, '2': 99.4, '3': 82.9}', 'Recall': '98.28, {'0': 97.78, '1': 99.69, '2': 98.96, '3': 96.68}', 'F1': ... |
| 3 | ONLY_RUS | CATBOOST | {'Precision': '97.1 +- 0.15', 'Recall': '97.15 +- 0.23', 'F1': '97.12 +- 0.18', 'ROC_AUC': '99.9 +- 0.01', 'Log_Loss': '0.0584 +- 0.0031'} | {'Precision': '81.04, {'0': 100.0, '1': 67.88, '2': 92.49, '3': 63.79}', 'Recall': '98.23, {'0': 97.25, '1': 99.76, '2': 99.85, '3': 96.07}', 'F1'... |
| 4 | ONLY_RUS | XGB_BORUTA | {'Precision': '97.34 +- 0.22', 'Recall': '97.5 +- 0.19', 'F1': '97.42 +- 0.18', 'ROC_AUC': '99.92 +- 0.01', 'Log_Loss': '0.0521 +- 0.0038'} | {'Precision': '86.33, {'0': 100.0, '1': 65.23, '2': 99.55, '3': 80.56}', 'Recall': '97.98, {'0': 97.57, '1': 99.73, '2': 98.26, '3': 96.37}', 'F1'... |
| 5 | ROS | RF | {'Precision': '98.29 +- 0.14', 'Recall': '98.09 +- 0.26', 'F1': '98.18 +- 0.18', 'ROC_AUC': '99.91 +- 0.02', 'Log_Loss': '0.0586 +- 0.0094'} | {'Precision': '83.36, {'0': 100.0, '1': 67.76, '2': 94.95, '3': 70.75}', 'Recall': '98.31, {'0': 97.48, '1': 99.73, '2': 99.2, '3': 96.83}', 'F1':... |
| 6 | ROS | XGB | {'Precision': '98.53 +- 0.15', 'Recall': '98.56 +- 0.22', 'F1': '98.54 +- 0.18', 'ROC_AUC': '99.95 +- 0.01', 'Log_Loss': '0.0391 +- 0.0044'} | {'Precision': '84.96, {'0': 99.99, '1': 66.77, '2': 99.5, '3': 73.59}', 'Recall': '98.05, {'0': 97.6, '1': 99.76, '2': 98.46, '3': 96.37}', 'F1': ... |
| 7 | ROS | LGB | {'Precision': '98.57 +- 0.17', 'Recall': '98.6 +- 0.17', 'F1': '98.58 +- 0.15', 'ROC_AUC': '99.95 +- 0.01', 'Log_Loss': '0.0383 +- 0.0031'} | {'Precision': '87.05, {'0': 100.0, '1': 66.71, '2': 99.5, '3': 81.99}', 'Recall': '98.34, {'0': 97.71, '1': 99.76, '2': 98.91, '3': 96.98}', 'F1':... |
| 8 | ROS | CATBOOST | {'Precision': '97.96 +- 0.1', 'Recall': '97.96 +- 0.17', 'F1': '97.95 +- 0.11', 'ROC_AUC': '99.93 +- 0.01', 'Log_Loss': '0.054 +- 0.0031'} | {'Precision': '80.74, {'0': 100.0, '1': 66.41, '2': 92.1, '3': 64.46}', 'Recall': '98.46, {'0': 97.1, '1': 99.76, '2': 99.7, '3': 97.28}', 'F1': '... |
| 9 | ROS | XGB_BORUTA | {'Precision': '98.38 +- 0.2', 'Recall': '98.49 +- 0.25', 'F1': '98.43 +- 0.21', 'ROC_AUC': '99.94 +- 0.01', 'Log_Loss': '0.0417 +- 0.0038'} | {'Precision': '85.53, {'0': 100.0, '1': 65.58, '2': 99.45, '3': 77.09}', 'Recall': '98.04, {'0': 97.54, '1': 99.73, '2': 98.81, '3': 96.07}', 'F1'... |
| 10 | SMOTENC | RF | {'Precision': '97.93 +- 0.11', 'Recall': '97.64 +- 0.2', 'F1': '97.78 +- 0.08', 'ROC_AUC': '99.88 +- 0.02', 'Log_Loss': '0.0725 +- 0.0083'} | {'Precision': '79.06, {'0': 100.0, '1': 64.8, '2': 93.47, '3': 57.98}', 'Recall': '98.18, {'0': 96.86, '1': 99.73, '2': 99.0, '3': 97.13}', 'F1': ... |
| 11 | SMOTENC | XGB | {'Precision': '98.06 +- 0.19', 'Recall': '97.97 +- 0.18', 'F1': '98.01 +- 0.14', 'ROC_AUC': '99.93 +- 0.01', 'Log_Loss': '0.0494 +- 0.0031'} | {'Precision': '83.75, {'0': 100.0, '1': 65.15, '2': 98.76, '3': 71.08}', 'Recall': '98.31, {'0': 97.38, '1': 99.25, '2': 99.35, '3': 97.28}', 'F1'... |
| 12 | SMOTENC | LGB | {'Precision': '98.14 +- 0.11', 'Recall': '98.12 +- 0.2', 'F1': '98.13 +- 0.12', 'ROC_AUC': '99.93 +- 0.01', 'Log_Loss': '0.0468 +- 0.0037'} | {'Precision': '84.82, {'0': 100.0, '1': 63.72, '2': 98.81, '3': 76.73}', 'Recall': '98.28, {'0': 97.33, '1': 99.28, '2': 99.4, '3': 97.13}', 'F1':... |
| 13 | SMOTENC | CATBOOST | {'Precision': '97.6 +- 0.14', 'Recall': '97.56 +- 0.28', 'F1': '97.57 +- 0.17', 'ROC_AUC': '99.91 +- 0.01', 'Log_Loss': '0.0598 +- 0.0024'} | {'Precision': '77.76, {'0': 100.0, '1': 64.76, '2': 91.69, '3': 54.59}', 'Recall': '98.18, {'0': 96.68, '1': 99.73, '2': 99.35, '3': 96.98}', 'F1'... |
| 14 | SMOTENC | XGB_BORUTA | {'Precision': '98.03 +- 0.17', 'Recall': '97.92 +- 0.19', 'F1': '97.97 +- 0.15', 'ROC_AUC': '99.92 +- 0.01', 'Log_Loss': '0.0524 +- 0.0028'} | {'Precision': '84.86, {'0': 100.0, '1': 64.62, '2': 98.8, '3': 76.01}', 'Recall': '97.94, {'0': 97.44, '1': 99.25, '2': 98.41, '3': 96.68}', 'F1':... |
| 15 | CTGAN | RF | {'Precision': '97.46 +- 0.15', 'Recall': '97.13 +- 0.3', 'F1': '97.29 +- 0.19', 'ROC_AUC': '99.87 +- 0.02', 'Log_Loss': '0.0697 +- 0.0071'} | {'Precision': '82.7, {'0': 99.99, '1': 67.19, '2': 94.52, '3': 69.09}', 'Recall': '97.9, {'0': 97.39, '1': 99.76, '2': 99.6, '3': 94.86}', 'F1': '... |
| 16 | CTGAN | XGB | {'Precision': '97.93 +- 0.14', 'Recall': '97.74 +- 0.21', 'F1': '97.83 +- 0.16', 'ROC_AUC': '99.93 +- 0.01', 'Log_Loss': '0.0471 +- 0.0036'} | {'Precision': '87.15, {'0': 99.99, '1': 67.5, '2': 99.45, '3': 81.67}', 'Recall': '98.16, {'0': 97.79, '1': 99.76, '2': 98.86, '3': 96.22}', 'F1':... |
| 17 | CTGAN | LGB | {'Precision': '98.0 +- 0.13', 'Recall': '98.01 +- 0.2', 'F1': '98.0 +- 0.13', 'ROC_AUC': '99.94 +- 0.01', 'Log_Loss': '0.043 +- 0.0027'} | {'Precision': '86.24, {'0': 100.0, '1': 67.28, '2': 99.4, '3': 78.27}', 'Recall': '98.3, {'0': 97.71, '1': 99.69, '2': 98.96, '3': 96.83}', 'F1': ... |
| 18 | CTGAN | CATBOOST | {'Precision': '97.52 +- 0.1', 'Recall': '97.35 +- 0.26', 'F1': '97.43 +- 0.18', 'ROC_AUC': '99.91 +- 0.01', 'Log_Loss': '0.0573 +- 0.0026'} | {'Precision': '79.57, {'0': 100.0, '1': 70.04, '2': 89.69, '3': 58.57}', 'Recall': '97.98, {'0': 97.24, '1': 99.73, '2': 99.95, '3': 95.02}', 'F1'... |
| 19 | CTGAN | XGB_BORUTA | {'Precision': '97.56 +- 0.1', 'Recall': '97.34 +- 0.15', 'F1': '97.45 +- 0.11', 'ROC_AUC': '99.91 +- 0.01', 'Log_Loss': '0.0553 +- 0.0031'} | {'Precision': '83.86, {'0': 100.0, '1': 66.34, '2': 99.35, '3': 69.76}', 'Recall': '98.15, {'0': 97.48, '1': 99.59, '2': 99.0, '3': 96.53}', 'F1':... |
In [11]:
# Write the summary table to the reports folder, without the index column.
results_csv_path = f"{REPORTS_PATH}training_results.csv"
results_df.to_csv(results_csv_path, index=False)
In [ ]: